
/*
 * Copyright (c) 2008-2009, Computational Crawling LP
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 *
 *    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *    * Neither the name of Computational Crawling LP, 80legs, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
 * Serializes and deserializes the customer results in 80legs.
 *
 * <p>
 * A simple way to get the results is the static function
 * {@link #readFile(String)}:
 *
 * <pre>{@code
 * HashMap<String, byte[]> results = CustomerResults.readFile(fileName);
 * }</pre>
 *
 * <p>
 * The basic streaming deserialize flow is shown below. The
 * {@code BufferedInputStream} can be replaced with any {@code InputStream}
 * (e.g. to read from memory):
 *
 * <pre>{@code
 * BufferedInputStream r = new BufferedInputStream(new FileInputStream(fileName));
 * CustomerResults results = new CustomerResults();
 * try {
 *     results.startRead(r);
 *     String url;
 *     while ((url = results.readNextUrl(r)) != null) {
 *         byte[] customerData = results.readNextData(r);
 *         // do something with the url and customerData
 *     }
 * } catch (Exception e) {
 *     // format error
 * }
 * }</pre>
 *
 * <p>
 * For people interested in deserializing in other languages, the file format
 * this class creates and reads is:
 *
 * <pre>{@code
 * <classID><versionID><URL-SIZE><URL><DATA-SIZE><DATA>
 * }</pre>
 * <ul>
 * <li>the last 4 items ({@code <URL-SIZE><URL><DATA-SIZE><DATA>}) repeat for
 * each url/data pair;</li>
 * <li>{@code <classID>}, {@code <versionID>}, {@code <URL-SIZE>} and
 * {@code <DATA-SIZE>} are 32-bit integers written in little-endian byte
 * order;</li>
 * <li>the url is encoded using UTF-8.</li>
 * </ul>
 *
 * <p>
 * An instance keeps a scratch buffer and the version read by
 * {@link #startRead(InputStream)}, so one instance should be used for one
 * stream at a time.
 */
public class CustomerResults {
	private static final int classID = 218217067;
	private static final int maxVersionID = 1;
	private static final String utfCharSet = "UTF-8";
	// Single-byte charset that maps chars 0-255 one-to-one onto byte values;
	// used to recover raw bytes that are being carried inside a String.
	private static final String binaryCharSet = "ISO-8859-1";

	private final byte[] useBytes; // 4-byte convenience array
	private int curVersionID;

	public CustomerResults() {
		useBytes = new byte[4];
	}

	/**
	 * Completely reads a CustomerResults file and returns the results as a
	 * {@code HashMap<String,byte[]>} with the URL in the String and the binary
	 * results in the byte[].
	 *
	 * Warning: The HashMap returned by this function can be quite large. If you
	 * want to be able to read a large file that cannot fit into memory, use the
	 * streaming flow shown in the class documentation instead.
	 *
	 * Note that this function is static. It should be called like this:
	 * {@code HashMap<String,byte[]> results = CustomerResults.readFile ( fileName );}
	 *
	 * @param fileName
	 *            This is the file to be read
	 *
	 * @return The url/byte[] pairs returned in a HashMap
	 *
	 * @throws Exception
	 *             if the file cannot be opened or read, or if its content is
	 *             not a valid (or is a truncated) CustomerResults stream.
	 *             Format errors are reported to the caller rather than being
	 *             silently turned into a partial or empty map.
	 */
	public static HashMap<String, byte[]> readFile(String fileName) throws Exception {
		BufferedInputStream r = new BufferedInputStream(new FileInputStream(fileName));
		try {
			return readAll(r);
		} finally {
			// always release the file handle, even when parsing fails
			r.close();
		}
	}

	/**
	 * Completely reads a CustomerResults from the input string and returns the
	 * results as a {@code HashMap<String,byte[]>} with the URL in the String
	 * and the binary results in the byte[].
	 *
	 * The string is treated as a container of raw bytes: every character is
	 * converted back to exactly one byte using ISO-8859-1, so the string must
	 * have been produced by decoding the serialized bytes with that charset
	 * (e.g. {@code new String(bytes, "ISO-8859-1")}). A multi-byte encoding
	 * such as UTF-8 cannot be used here because the binary header of the
	 * format is not valid UTF-8 and therefore can never be reproduced from a
	 * String.
	 *
	 * Warning: The HashMap returned by this function can be quite large.
	 *
	 * Note that this function is static. It should be called like this:
	 * {@code HashMap<String,byte[]> results = CustomerResults.readString ( inData );}
	 *
	 * @param inData
	 *            This is the string to be read
	 *
	 * @return The url/byte[] pairs returned in a HashMap
	 *
	 * @throws Exception
	 *             if the content is not a valid (or is a truncated)
	 *             CustomerResults stream
	 */
	public static HashMap<String, byte[]> readString(String inData) throws Exception {
		byte[] bytes = inData.getBytes(binaryCharSet);
		return readAll(new ByteArrayInputStream(bytes));
	}

	/**
	 * Initializes a new read from a CustomerResults stream. It validates the
	 * classID and versionID.
	 *
	 * @param r
	 *            This is a InputStream for the input of the data
	 *
	 * @throws Exception
	 *             if the header is incomplete, the classID does not match, or
	 *             the versionID is newer than this class supports
	 */
	public void startRead(InputStream r) throws Exception {
		// read and validate the classID - this is used to partially validate
		// that this is the right file type.
		int curClassID = readHeaderInt(r, "classID");
		if (curClassID != classID) {
			throw (new Exception("Bad classID=" + curClassID + ", should be " + classID + ". This is probably not a valid CustomerResults file."));
		}

		// read and validate the versionID - this is used internally to handle
		// different versions of this file
		curVersionID = readHeaderInt(r, "versionID");
		if (curVersionID > maxVersionID) {
			throw (new Exception("Bad Version Code=" + curVersionID + ", maxVersionID=" + maxVersionID
					+ ". This is either not a valid file or a newer version of CustomerResults.java is available on the 80legs website"));
		}
	}

	/**
	 * Initializes a new write to a new stream. It writes the format classID
	 * and versionID.
	 *
	 * @param w
	 *            This is a OutputStream for the output of the data
	 *
	 * @throws Exception
	 *             if writing to the stream fails
	 */
	public void startWrite(OutputStream w) throws Exception {
		intToByteArray(classID, useBytes, 0);
		w.write(useBytes);

		curVersionID = maxVersionID;
		intToByteArray(curVersionID, useBytes, 0);
		w.write(useBytes);
	}

	/**
	 * Writes a single url/data pair to an output stream.
	 *
	 * @param w
	 *            This is a OutputStream for the output of the data
	 * @param url
	 *            The URL to be encoded (written as UTF-8); must not be null
	 * @param customData
	 *            The customerData to be encoded; must not be null
	 *
	 * @return the size of this write in bytes, including both 4-byte size
	 *         fields
	 */
	public int writeResult(OutputStream w, String url, byte[] customData) throws UnsupportedEncodingException, IOException {

		int totalBytes = 0;
		totalBytes += writeBytesAndSize(w, url.getBytes(utfCharSet));
		totalBytes += writeBytesAndSize(w, customData);

		return totalBytes;
	}

	/**
	 * Reads a single url from a CustomerResults stream.
	 *
	 * @param r
	 *            This is a InputStream for the input of the data
	 *
	 * @return the URL, or null when fewer than four bytes remain in the stream
	 *         (end of data)
	 *
	 * @throws Exception
	 *             if the current versionID is not supported, the size field is
	 *             negative, or the stream ends in the middle of the url
	 */
	public String readNextUrl(InputStream r) throws Exception {
		requireKnownVersion();

		byte[] urlBytes = readBytesAndSize(r);
		if (urlBytes == null) {
			return null;
		}
		return new String(urlBytes, utfCharSet);
	}

	/**
	 * Reads a single byte[] from a CustomerResults stream.
	 *
	 * @param r
	 *            This is a InputStream for the input of the data
	 *
	 * @return the customer data as a byte[], or null when fewer than four
	 *         bytes remain in the stream (end of data)
	 *
	 * @throws Exception
	 *             if the current versionID is not supported, the size field is
	 *             negative, or the stream ends in the middle of the data
	 */
	public byte[] readNextData(InputStream r) throws Exception {
		requireKnownVersion();

		return readBytesAndSize(r);
	}

	/*
	 * private members below
	 */

	/*
	 * reads the header, then every url/data pair until the end of the stream,
	 * collecting the pairs into a HashMap
	 */
	private static HashMap<String, byte[]> readAll(InputStream r) throws Exception {
		CustomerResults customerResults = new CustomerResults();
		HashMap<String, byte[]> resultsData = new HashMap<String, byte[]>();

		customerResults.startRead(r);
		String url;
		while ((url = customerResults.readNextUrl(r)) != null) {
			byte[] customerData = customerResults.readNextData(r);

			resultsData.put(url, customerData);
		}

		return resultsData;
	}

	/*
	 * rejects any versionID whose record layout this class does not know
	 */
	private void requireKnownVersion() throws Exception {
		if (curVersionID != 1) {
			throw (new Exception("Unknown versionID=" + curVersionID));
		}
	}

	/*
	 * reads one mandatory 32-bit header field; a short read is an error
	 */
	private int readHeaderInt(InputStream r, String fieldName) throws IOException {
		if (readFully(r, useBytes) < useBytes.length) {
			throw new IOException("Unexpected end of stream while reading " + fieldName + ". This is probably not a valid CustomerResults file.");
		}
		return byteArrayToInt(useBytes, 0);
	}

	/*
	 * reads the size as a 32-bit integer then reads and returns that number of
	 * bytes. Returns null when fewer than 4 bytes are left for the size field,
	 * which is how the end of the data is detected (InputStream.available() is
	 * only an estimate and is deliberately not used for this).
	 */
	private byte[] readBytesAndSize(InputStream r) throws Exception {
		if (readFully(r, useBytes) < useBytes.length) {
			return null;
		}

		int size = byteArrayToInt(useBytes, 0);
		if (size < 0) {
			throw (new Exception("Bad Size"));
		}

		byte[] b = new byte[size];
		int got = readFully(r, b);
		if (got < size) {
			// never hand back a partially filled (zero padded) array
			throw new IOException("Unexpected end of stream: expected " + size + " bytes but only " + got + " were available");
		}
		return b;
	}

	/*
	 * keeps reading until the buffer is full or the stream ends, because a
	 * single InputStream.read() call may return fewer bytes than requested.
	 * Returns the number of bytes actually stored in b.
	 */
	private static int readFully(InputStream r, byte[] b) throws IOException {
		int total = 0;
		while (total < b.length) {
			int n = r.read(b, total, b.length - total);
			if (n < 0) {
				break;
			}
			total += n;
		}
		return total;
	}

	/*
	 * writes the size as a 32-bit integer then writes the bytes
	 */
	private int writeBytesAndSize(OutputStream w, byte[] b) throws IOException {
		intToByteArray(b.length, useBytes, 0);
		w.write(useBytes);
		w.write(b);

		return useBytes.length + b.length;
	}

	/*
	 * convert an int to a byte array in little endian
	 */
	private static void intToByteArray(int i, byte[] b, int byteStart) {
		b[byteStart + 3] = (byte) ((i >> 24) & 0xFF);
		b[byteStart + 2] = (byte) ((i >> 16) & 0xFF);
		b[byteStart + 1] = (byte) ((i >> 8) & 0xFF);
		b[byteStart + 0] = (byte) (i & 0xFF);
	}

	/*
	 * convert an byte array to an int (assumes byte-array was little endian)
	 */
	private static int byteArrayToInt(byte[] b, int byteStart) {
		return ((int) b[byteStart + 3] << 24) | (((int) b[byteStart + 2] & 0xFF) << 16) | (((int) b[byteStart + 1] & 0xFF) << 8)
				| ((int) b[byteStart + 0] & 0xFF);
	}
}
